Analysis of clone size, diversity in fate of barcoded progenitors from WT and G6PD-Tg mice

In this experiment we sorted MPPs from G6PD-Tg mice and from WT littermate controls (send from IMDEA madrid) and barcoded them with the LG2.2 library. Cells were transplanted into irradiated recipients and 3 weeks later the bone marrow was harvested, barcoded cells were sorted and lysed for barcode library sequencing.

Step 1: Set up the workspace

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Wet_lab/JCW24_G6PD_barcoding/LentiviralBarcoding_Analysis/inputs/")

#clear the workspace and load in the necessary packages
rm(list = ls())
library(cowplot)
library(plotrix)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
source('/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Wet_lab/JCW24_G6PD_barcoding/LentiviralBarcoding_Analysis/helper_methods_mature.R')

setwd("/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Wet_lab/JCW24_G6PD_barcoding/LentiviralBarcoding_Analysis/inputs/")
# Set output directory
output_fp <- "/Users/jasoncosgrove/Desktop/"

Step 2: Load the data

The data has been demultiplexed, QCd and filtered using xcalibr, and R scripts. Full pipeline available in the LentiviralBarcoding_Demultiplexing and LentiviralBarcoding_QC folders

#read in the QC'd and filtered data
wt.mat <- read.csv("JCW24WT_ANALYSIS_matx_norm_filt_cor_ab_trueFalseRU.csv")
tg.mat <- read.csv("JCW24TG_ANALYSIS_matx_norm_filt_cor_ab_trueFalseRU.csv")

Step 3: Sample-level normalisation

Population sizes vary dramatically between different cell types in the hematopoietic system and so a normalisation should be made to the data prior to making biological inferences. We perform a column-wise normalisation such that each column sums to 1. The value in each i,j position then represents the proportional abundance of barcode i in sample j.

# normalise the count matrices
wt.barcodes <- wt.mat$tag
tg.barcodes <- tg.mat$tag
wt.mat.norm <- columnNormalisation(wt.mat[,-1],wt.barcodes)
tg.mat.norm <- columnNormalisation(tg.mat[,-1],tg.barcodes)

#remove clones that dont appear in any sample and remove the mpps from the analysis
wt.mat.norm.filt <- removeUninformativeClones(wt.mat.norm[,!grepl("CKIT",colnames(wt.mat.norm))])
tg.mat.norm.filt <- removeUninformativeClones(tg.mat.norm[,!grepl("CKIT",colnames(tg.mat.norm))])

Step 4: Make a count matrix for each mouse

Split the matrix so that we can analyse each mouse independently

wt.m1<- getMouseMatrix(wt.mat.norm.filt,"M1.WT")
wt.m2<- getMouseMatrix(wt.mat.norm.filt,"M2.WT")
wt.m3<- getMouseMatrix(wt.mat.norm.filt,"M3.WT")
wt.m4<- getMouseMatrix(wt.mat.norm.filt,"M4.WT")
tg.m1<- getMouseMatrix(tg.mat.norm.filt,"M1.TG")
tg.m2<- getMouseMatrix(tg.mat.norm.filt,"M2.TG")
tg.m3<- getMouseMatrix(tg.mat.norm.filt,"M3.TG")
tg.m4<- getMouseMatrix(tg.mat.norm.filt,"M4.TG")

Step 5: cell count normalisation

The count matrix actually represents the number of sequencing reads measured for each barcode. But it is not the case that 1 read equals 1 cell because it depends on how many cells you had in the beginning, how much of your sample was sorted, and what is your sequencing depth. We record how many cells we sort and how much volume of our total sample we sort so we can use this information to convert reads to cell numbers.

# now normalise the read counts by cell counts
m1.wt.cellnorm <- cellcountNormalisation("M1-WT", wt.m1)
m2.wt.cellnorm <- cellcountNormalisation("M2-WT", wt.m2)
m3.wt.cellnorm <- cellcountNormalisation("M3-WT", wt.m3)
m4.wt.cellnorm <- cellcountNormalisation("M4-WT", wt.m4)
m1.tg.cellnorm <- cellcountNormalisation("M1-Tg", tg.m1)
m2.tg.cellnorm <- cellcountNormalisation("M2-Tg", tg.m2)
m3.tg.cellnorm <- cellcountNormalisation("M3-Tg", tg.m3)
m4.tg.cellnorm <- cellcountNormalisation("M4-Tg", tg.m4)

Step 6: for each sample make a meta-mouse merging individual mice into one representative sample for each experimental condition

Here we combine mice by each sample to create a metasample for each condition. This allows us to look globally at what the difference between the two conditions is

# merge the different mice into a single sample
wt.meta <- makeMetaMouse(wt.m1, wt.m2, wt.m3, wt.m4)
tg.meta <- makeMetaMouse(tg.m1, tg.m2, tg.m3, tg.m4)

# merge the different mice into a single sample
wt.meta.cellnorm <- makeMetaMouse(m1.wt.cellnorm,
                                  m2.wt.cellnorm,
                                  m3.wt.cellnorm,
                                  m4.wt.cellnorm)

tg.meta.cellnorm <- makeMetaMouse(m1.tg.cellnorm,
                                  m2.tg.cellnorm,
                                  m3.tg.cellnorm,
                                  m4.tg.cellnorm)

Step 7: Clone size distributions per sample

When analysing lineage tracing data one key metric is the clone size distributions. Here we analyse clone size distributions for the M, E and B lineages in the WT and Tg mice. We observe significant differences in clone size distributions for E and B when we group all mice together

boxplot(wt.meta$M,wt.meta$E, wt.meta$B,
        tg.meta$M,tg.meta$E, tg.meta$B , 
        outline = F,las = 2,ylab = "relative barcode abundance per lineage", main = "Clone Size Distributions Per Condition per Lineage",
        at =c(1,2,3, 5,6,7),col = c("red","red","red","blue","blue","blue"),
        names = c("WT - M", "WT - E","WT - B",
                  "Tg - M", "Tg - E","Tg - B"))

wilcox.test(wt.meta$M,tg.meta$M)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta$M and tg.meta$M
## W = 234804, p-value = 0.5358
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.meta$B,tg.meta$B)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta$B and tg.meta$B
## W = 222583, p-value = 0.0233
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.meta$E,tg.meta$E)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta$E and tg.meta$E
## W = 223406, p-value = 0.03011
## alternative hypothesis: true location shift is not equal to 0
#clone size distributions normalised by cell count
boxplot(wt.meta.cellnorm$M,wt.meta.cellnorm$E, wt.meta.cellnorm$B,
        tg.meta.cellnorm$M,tg.meta.cellnorm$E, tg.meta.cellnorm$B , 
        outline = F,
        las = 2,ylab = "cells per barcode per lineage", main = "Clone Size Distributions in Cell Counts per Condition per Lineage",
        at =c(1,2,3, 5,6,7),col = c("red","red","red","blue","blue","blue"),
        names = c("WT - M", "WT - E","WT - B",
                  "Tg - M", "Tg - E","Tg - B"))

wilcox.test(wt.meta.cellnorm$M,tg.meta.cellnorm$M)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta.cellnorm$M and tg.meta.cellnorm$M
## W = 226516, p-value = 0.08257
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.meta.cellnorm$B,tg.meta.cellnorm$B)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta.cellnorm$B and tg.meta.cellnorm$B
## W = 237815, p-value = 0.8307
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.meta.cellnorm$E,tg.meta.cellnorm$E)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.meta.cellnorm$E and tg.meta.cellnorm$E
## W = 218966, p-value = 0.005592
## alternative hypothesis: true location shift is not equal to 0

Step 8: Clone size distributions per mouse

Here we plot clone size distributions for each lineage for each mouse

boxplot(wt.m1$M,wt.m1$E, wt.m1$B,
        wt.m2$M,wt.m2$E, wt.m2$B,
        wt.m3$M,wt.m3$E, wt.m3$B,
        wt.m4$M,wt.m4$E, wt.m4$B,
        tg.m1$M,tg.m1$E, tg.m1$B , 
        tg.m2$M,tg.m2$E, tg.m2$B , 
        tg.m3$M,tg.m3$E, tg.m3$B , 
        tg.m4$M,tg.m4$E, tg.m4$B , 
        outline = F,las = 2,ylab = "relative barcode abundance", 
        main = "Clone Size Distributions Per Condition per Mouse",
        col = c(rep("red",12),rep("blue",12)),
        names = c("WT1 - M", "WT1 - E","WT1 - B",
                  "WT2 - M", "WT2 - E","WT2 - B",
                  "WT3 - M", "WT3 - E","WT3 - B",
                  "WT4 - M", "WT4 - E","WT4 - B",
                  "Tg1 - M", "Tg1 - E","Tg1 - B",
                  "Tg2 - M", "Tg2 - E","Tg2 - B",
                  "Tg3 - M", "Tg3 - E","Tg3 - B",
                  "Tg4 - M", "Tg4 - E","Tg4 - B"))

#Plot the median Myeloid clone size for each mouse, comparing WT and Tg
plotCloneSizes(wt.m1,wt.m2,wt.m3,wt.m4,
               tg.m1,tg.m2,tg.m3,tg.m4,"M")

plotCloneSizes(wt.m1,wt.m2,wt.m3,wt.m4,
               tg.m1,tg.m2,tg.m3,tg.m4,"B")

plotCloneSizes(wt.m1,wt.m2,wt.m3,wt.m4,
               tg.m1,tg.m2,tg.m3,tg.m4,"E")

boxplot(m1.wt.cellnorm$M,m1.wt.cellnorm$E, m1.wt.cellnorm$B,
        m2.wt.cellnorm$M,m2.wt.cellnorm$E, m2.wt.cellnorm$B,
        m3.wt.cellnorm$M,m3.wt.cellnorm$E, m3.wt.cellnorm$B,
        m4.wt.cellnorm$M,m4.wt.cellnorm$E, m4.wt.cellnorm$B,
        m1.tg.cellnorm$M,m1.tg.cellnorm$E, m1.tg.cellnorm$B , 
        m2.tg.cellnorm$M,m2.tg.cellnorm$E, m2.tg.cellnorm$B , 
        m3.tg.cellnorm$M,m3.tg.cellnorm$E, m3.tg.cellnorm$B , 
        m4.tg.cellnorm$M,m4.tg.cellnorm$E, m4.tg.cellnorm$B , 
        outline = F,las = 2,ylab = "relative barcode abundance", 
        main = "Clone Size Distributions Per Condition per Mouse",
        col = c(rep("red",12),rep("blue",12)),
        names = c("WT1 - M", "WT1 - E","WT1 - B",
                  "WT2 - M", "WT2 - E","WT2 - B",
                  "WT3 - M", "WT3 - E","WT3 - B",
                  "WT4 - M", "WT4 - E","WT4 - B",
                  "Tg1 - M", "Tg1 - E","Tg1 - B",
                  "Tg2 - M", "Tg2 - E","Tg2 - B",
                  "Tg3 - M", "Tg3 - E","Tg3 - B",
                  "Tg4 - M", "Tg4 - E","Tg4 - B"))

plotCloneSizes(m1.wt.cellnorm,m2.wt.cellnorm,m3.wt.cellnorm,m4.wt.cellnorm,
               m1.tg.cellnorm,m2.tg.cellnorm,m3.tg.cellnorm,m4.tg.cellnorm,"M")

plotCloneSizes(m1.wt.cellnorm,m2.wt.cellnorm,m3.wt.cellnorm,m4.wt.cellnorm,
               m1.tg.cellnorm,m2.tg.cellnorm,m3.tg.cellnorm,m4.tg.cellnorm,"B")

plotCloneSizes(m1.wt.cellnorm,m2.wt.cellnorm,m3.wt.cellnorm,m4.wt.cellnorm,
               m1.tg.cellnorm,m2.tg.cellnorm,m3.tg.cellnorm,m4.tg.cellnorm,"E")

Step 9: Clone diversity in the meta mouse

Aside from clone size distributions we can also look at how many clones there are in each celltype. This gives an indication of how many MPPs contribute to each cell lineage

barplot(c(wt.meta %>% filter(M > 0) %>% nrow(),
        tg.meta %>% filter(M > 0) %>% nrow(),
        wt.meta %>% filter(E > 0) %>% nrow(),
        tg.meta %>% filter(E > 0) %>% nrow(),
        wt.meta %>% filter(B > 0) %>% nrow(),
        tg.meta %>% filter(B > 0) %>% nrow()),
        names = c("WT-M","Tg-M", 
                  "WT-E","Tg-E",
                  "WT-B","Tg-B"),
        col = c("red","blue","red","blue","red","blue"), 
        main = "clonal diversity")

Step 10: Clone diversity between mice

Here we how the same as above but show mouse to mouse variability. We observe no significant differences between WT and Tg in this setting

#compute the total clonal diversity in each setting
wt.m <- c(table(wt.m1$M > 0)[[2]],
          table(wt.m2$M > 0)[[2]],
          table(wt.m3$M > 0)[[2]],
          table(wt.m4$M > 0)[[2]])

tg.m <- c(table(tg.m1$M > 0)[[2]],
          table(tg.m2$M > 0)[[2]],
          table(tg.m3$M > 0)[[2]],
          table(tg.m4$M > 0)[[2]])

boxplot(wt.m,tg.m, main = "Comparison of clone size distributions between WT and Tg in Myeloid",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("blue","red"))

wt.e <- c(table(wt.m1$E > 0)[[2]],
          table(wt.m2$E > 0)[[2]],
          table(wt.m3$E > 0)[[2]],
          table(wt.m4$E > 0)[[2]])

tg.e <- c(table(tg.m1$E > 0)[[2]],
          table(tg.m2$E > 0)[[2]],
          table(tg.m3$E > 0)[[2]],
          table(tg.m4$E > 0)[[2]])

boxplot(wt.e,tg.e, main = "Comparison of clone size distributions between WT and Tg in Erythroid",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("blue","red"))

wt.b <- c(table(wt.m1$B > 0)[[2]],
          table(wt.m2$B > 0)[[2]],
          table(wt.m3$B > 0)[[2]],
          table(wt.m4$B > 0)[[2]])

tg.b <- c(table(tg.m1$B > 0)[[2]],
          table(tg.m2$B > 0)[[2]],
          table(tg.m3$B > 0)[[2]],
          table(tg.m4$B > 0)[[2]])

boxplot(wt.b,tg.b, main = "Comparison of clone size distributions between WT and Tg in B",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("blue","red"))

Step 11: M vs E bias in the meta mouse

Now we have analysed differences in clone size distributions and clonal diversity we can look at barcode sharing across lineages. First we will compare 2 lineages at a time using a simple scatter plot

qplot(log(wt.meta.cellnorm$M+ 1) ,log(wt.meta.cellnorm$E + 1),main = "WT M vs E with cell normalisation", xlim = c(0,11),ylim = c(0,11)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

qplot(log(tg.meta.cellnorm$M+ 1) ,log(tg.meta.cellnorm$E + 1),main = "Tg M vs E with cell normalisation" , xlim = c(0,11),ylim = c(0,11)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

Step 11: M vs B bias in the meta mouse

qplot(log(wt.meta.cellnorm$M+ 1) ,log(wt.meta.cellnorm$B + 1),main = "WT M vs B with cell normalisation", xlim = c(0,12),ylim = c(0,12)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

qplot(log(tg.meta.cellnorm$M+ 1) ,log(tg.meta.cellnorm$B + 1),main = "Tg M vs B with cell normalisation" , xlim = c(0,12),ylim = c(0,12)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

Step 11: B vs E bias in the meta mouse

qplot(log(wt.meta.cellnorm$B+ 1) ,log(wt.meta.cellnorm$E + 1),main = "WT B vs E with cell normalisation", xlim = c(0,12),ylim = c(0,12)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

qplot(log(tg.meta.cellnorm$B+ 1) ,log(tg.meta.cellnorm$E + 1),main = "Tg B vs E with cell normalisation" , xlim = c(0,12),ylim = c(0,12)) + geom_abline(intercept = 0,slope = 1, col = 'grey30') +  geom_point(size=1)

Step 12: Heatmaps

We can also analyse barcode sharing by heatmap visualisation and unsupervised hierarchical clustering. We do not observe much striking differences between the WT and Tg

#heatmap with relative clone abundances
heatmap(log(as.matrix(wt.meta[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

heatmap(log(as.matrix(tg.meta[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

#heatmap with cell-count normalisation
heatmap(log(as.matrix(wt.meta.cellnorm[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

heatmap(log(as.matrix(tg.meta.cellnorm[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

Step 13: Ternary Plots

Another way to look at biases across 3 lineages is to do a ternary plot. We dont observe any striking differences between WT and Tg using this visualisation.

#row-wise normalisations
tg.meta.norm <- performRowNormalisation(tg.meta)
wt.meta.norm <- performRowNormalisation(wt.meta)
tg.meta.cellnorm.norm <- performRowNormalisation(tg.meta.cellnorm)
wt.meta.cellnorm.norm <- performRowNormalisation(wt.meta.cellnorm)

triax.plot(wt.meta.norm, col.symbols = "red",show.grid = T,pch = 16)

triax.plot(tg.meta.norm, col.symbols = "blue",show.grid = T,pch = 16)

triax.plot(wt.meta.cellnorm.norm, col.symbols = "red",show.grid = F,pch = 16)

triax.plot(tg.meta.cellnorm.norm, col.symbols = "blue",show.grid = F,pch = 16)

Step 14: Multi-outcome classifier with 10% threshold

threshold <- 0.1

barplot(multiOutcomeClassifier(wt.meta.norm, threshold = threshold)$table,ylim = c(0,300), main = "WT")

barplot(multiOutcomeClassifier(tg.meta.norm, threshold = threshold)$table,ylim = c(0,300), main = "Tg")

barplot(multiOutcomeClassifier(wt.meta.cellnorm.norm, threshold = threshold)$table,ylim = c(0,300), main = "WT cellnorm")

barplot(multiOutcomeClassifier(tg.meta.cellnorm.norm, threshold = threshold)$table,ylim = c(0,300), main = "Tg cellnorm")

m1.wt.cellnorm.norm <- performRowNormalisation(m1.wt.cellnorm)
barplot(multiOutcomeClassifier(m1.wt.cellnorm.norm, threshold = threshold)$table, main = "M1 WT" ,ylim = c(0,100))

m2.wt.cellnorm.norm <- performRowNormalisation(m2.wt.cellnorm)
barplot(multiOutcomeClassifier(m2.wt.cellnorm.norm, threshold = threshold)$table ,main = "M2 WT",ylim = c(0,100))

m3.wt.cellnorm.norm <- performRowNormalisation(m3.wt.cellnorm)
barplot(multiOutcomeClassifier(m3.wt.cellnorm.norm, threshold = threshold)$table,main = "M3 WT",ylim = c(0,100))

m4.wt.cellnorm.norm <- performRowNormalisation(m4.wt.cellnorm)
barplot(multiOutcomeClassifier(m4.wt.cellnorm.norm, threshold = threshold)$table,main = "M4 WT",ylim = c(0,100))

m1.tg.cellnorm.norm <- performRowNormalisation(m1.tg.cellnorm)
barplot(multiOutcomeClassifier(m1.tg.cellnorm.norm, threshold = threshold)$table ,main = "M1 Tg",ylim = c(0,100))

m2.tg.cellnorm.norm <- performRowNormalisation(m2.tg.cellnorm)
barplot(multiOutcomeClassifier(m2.tg.cellnorm.norm, threshold = threshold)$table ,main = "M2 Tg",ylim = c(0,100))

m3.tg.cellnorm.norm <- performRowNormalisation(m3.tg.cellnorm)
barplot(multiOutcomeClassifier(m3.tg.cellnorm.norm, threshold = threshold)$table,main = "M3 Tg",ylim = c(0,100))

m4.tg.cellnorm.norm <- performRowNormalisation(m4.tg.cellnorm)
barplot(multiOutcomeClassifier(m4.tg.cellnorm.norm, threshold = threshold)$table,main = "M4 Tg",ylim = c(0,100))

plotClassifierLineage("B",threshold)
## Warning in wilcox.test.default(wt, tg): cannot compute exact p-value with ties

plotClassifierLineage("BE",threshold)
## Warning in wilcox.test.default(wt, tg): cannot compute exact p-value with ties

plotClassifierLineage("BM",threshold)
## Warning in wilcox.test.default(wt, tg): cannot compute exact p-value with ties

plotClassifierLineage("BME",threshold)
## Warning in wilcox.test.default(wt, tg): cannot compute exact p-value with ties

plotClassifierLineage("E",threshold)

plotClassifierLineage("M",threshold)
## Warning in wilcox.test.default(wt, tg): cannot compute exact p-value with ties

plotClassifierLineage("ME",threshold)

When we sorted our cells we noticed that there were less B-cells that expressed GFP. Lets to a focused analysis to understand why this is happening

wt.cumsum <- generateCumulativeProbability(wt.meta.cellnorm,"B")
tg.cumsum <- generateCumulativeProbability(tg.meta.cellnorm,"B")
plot(wt.cumsum$cell_counts,wt.cumsum$cumsum,col = "red",lwd = 1,
     ylim = c(0,5e5))
points(tg.cumsum$cell_counts,tg.cumsum$cumsum,col = "blue",lwd = 1)

wt.cumsum <- generateCumulativeProbabilityNorm(wt.meta.cellnorm,"B")
tg.cumsum <- generateCumulativeProbabilityNorm(tg.meta.cellnorm,"B")
plot(1:nrow(wt.cumsum)/nrow(wt.cumsum),wt.cumsum$cumsum,col = "red",lwd = 1,xlab = "% of all clones", ylab = "cumulative cellular output")
points(1:nrow(tg.cumsum)/nrow(tg.cumsum),tg.cumsum$cumsum,col = "blue",lwd = 1)

m1.wt <- generateCumulativeProbabilityNorm(m1.wt.cellnorm,"B")
m2.wt <- generateCumulativeProbabilityNorm(m2.wt.cellnorm,"B")
m3.wt <- generateCumulativeProbabilityNorm(m3.wt.cellnorm,"B")
m4.wt <- generateCumulativeProbabilityNorm(m4.wt.cellnorm,"B")

m1.tg <- generateCumulativeProbabilityNorm(m1.tg.cellnorm,"B")
m2.tg <- generateCumulativeProbabilityNorm(m2.tg.cellnorm,"B")
m3.tg <- generateCumulativeProbabilityNorm(m3.tg.cellnorm,"B")
m4.tg <- generateCumulativeProbabilityNorm(m4.tg.cellnorm,"B")


plot(1:nrow(m1.wt)/nrow(m1.wt),m1.wt$cumsum,col = "red",lwd = 1,xlim = c(0,1),
     xlab = "% of all clones", ylab = "cumulative number of cells")
points(1:nrow(m2.wt)/nrow(m2.wt),m2.wt$cumsum,col = "orange",lwd = 1)
points(1:nrow(m3.wt)/nrow(m3.wt),m3.wt$cumsum,col = "dark red",lwd = 1)
points(1:nrow(m4.wt)/nrow(m4.wt),m4.wt$cumsum,col = "yellow",lwd = 1)
points(1:nrow(m1.tg)/nrow(m1.tg),m1.tg$cumsum,col = "blue",lwd = 1)
points(1:nrow(m2.tg)/nrow(m2.tg),m2.tg$cumsum,col = "dark blue",lwd = 1)
points(1:nrow(m3.tg)/nrow(m3.tg),m3.tg$cumsum,col = "green",lwd = 1)
points(1:nrow(m4.tg)/nrow(m4.tg),m4.tg$cumsum,col = "dark green",lwd = 1)
abline(h = 0.05, col = 'red')
text(0.2,0.05,"threshold for high output clones",srt=0.1,pos=3, col = "red",cex = 0.7)

wt.highoutputclones <- c(
    m1.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m2.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m3.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m4.wt %>% filter(cumsum > 0.05) %>% nrow())


tg.highoutputclones <- c(
    m1.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m2.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m3.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m4.tg %>% filter(cumsum > 0.05) %>% nrow())


boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in B: clonal diversity",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

#data is normally distributed
shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.97025, p-value = 0.9
t.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Welch Two Sample t-test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## t = -2.4814, df = 5.8167, p-value = 0.04897
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -28.41003311  -0.08996689
## sample estimates:
## mean of x mean of y 
##     20.25     34.50
#plot the clone size distributions for both conditions
wt.allmice <- unlist(c(m1.wt %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.wt %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.wt %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.wt %>% filter(cumsum > 0.05) %>% select(cell_counts)))


tg.allmice <- unlist(c(m1.tg %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.tg %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.tg %>% filter(cumsum > 0.05) %>% select(cell_counts),
      m2.tg %>% filter(cumsum > 0.05) %>% select(cell_counts)))


boxplot(wt.allmice,
        tg.allmice ,outline = F,
        main = "High output clones in B: clone sizes",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

shapiro.test(c(wt.allmice,
               tg.allmice))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.allmice, tg.allmice)
## W = 0.56225, p-value < 2.2e-16
wilcox.test(wt.allmice,tg.allmice)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.allmice and tg.allmice
## W = 8454, p-value = 4.031e-09
## alternative hypothesis: true location shift is not equal to 0
write.csv(wt.allmice,"/Users/jasoncosgrove/Desktop/wt_clonesize.csv")
write.csv(tg.allmice,"/Users/jasoncosgrove/Desktop/tg_clonesize.csv")

#now show the median clone size for each mouse
wt.highoutputclones <- unlist(c(
    m1.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))


tg.highoutputclones <- unlist(c(
    m1.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))

#data is normally distributed




boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in B: clone sizes",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.94401, p-value = 0.6509
t.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Welch Two Sample t-test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## t = 2.6073, df = 5.0571, p-value = 0.04731
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##    18.50125 2105.34364
## sample estimates:
## mean of x mean of y 
## 1692.0576  630.1351
wt.clones <- c(m1.wt %>% filter(cumsum > 0.05) %>% rownames,
               m2.wt %>% filter(cumsum > 0.05) %>% rownames,
               m3.wt %>% filter(cumsum > 0.05) %>% rownames,
               m4.wt %>% filter(cumsum > 0.05) %>% rownames)


tg.clones <- c(m1.tg %>% filter(cumsum > 0.05) %>% rownames,
               m2.tg %>% filter(cumsum > 0.05) %>% rownames,
               m3.tg %>% filter(cumsum > 0.05) %>% rownames,
               m4.tg %>% filter(cumsum > 0.05) %>% rownames)





wt.subset <- wt.meta.cellnorm[wt.clones,]
tg.subset <- tg.meta.cellnorm[tg.clones,]
wt.subset.rownorm <- wt.meta.cellnorm.norm[wt.clones,]
tg.subset.rownorm <- tg.meta.cellnorm.norm[tg.clones,]


#heatmap with cell-count normalisation
heatmap(log(as.matrix(wt.subset[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

heatmap(log(as.matrix(tg.subset[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

boxplot(wt.subset.rownorm$B, tg.subset.rownorm$B,
        wt.subset.rownorm$M, tg.subset.rownorm$M,
        wt.subset.rownorm$E, tg.subset.rownorm$E)

write.csv(wt.subset.rownorm,"/Users/jasoncosgrove/Desktop/wt_rownorm.csv")
write.csv(tg.subset.rownorm,"/Users/jasoncosgrove/Desktop/tg_rownorm.csv")

wilcox.test(wt.subset.rownorm$B, tg.subset.rownorm$B)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.subset.rownorm$B and tg.subset.rownorm$B
## W = 7056, p-value = 0.001197
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.subset.rownorm$M, tg.subset.rownorm$M)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.subset.rownorm$M and tg.subset.rownorm$M
## W = 4436, p-value = 0.0109
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(wt.subset.rownorm$E, tg.subset.rownorm$E)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  wt.subset.rownorm$E and tg.subset.rownorm$E
## W = 3537, p-value = 5.849e-06
## alternative hypothesis: true location shift is not equal to 0
#heatmap with cell-count normalisation
heatmap(log(as.matrix(wt.subset[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

heatmap(log(as.matrix(tg.subset[,c("B","M","E")] + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

combined.mat <- rbind(wt.subset,tg.subset)

new.mat <- matrix(0,nrow = nrow(combined.mat),ncol = 6)
rownames(new.mat) <- row.names(combined.mat)

colnames(new.mat) <- c("WT-B", "WT-M","WT-E",
                       "Tg-B", "Tg-M","Tg-E")

new.mat <- data.frame(new.mat)
new.mat[1:nrow(wt.subset),1:3] <- wt.subset[,1:3]

new.mat[82:219,4:6] <- tg.subset[,1:3]



heatmap(log(as.matrix(new.mat + 1)) ,col=colorRampPalette(c("black","green","red"),bias = 2)(300),scale = "none",distfun = function(x) dist(x,method = 'euclidean'))

#lets run the same analysis in the other lineages to see if it’s really a B-cell specific phenomenon

wt.cumsum <- generateCumulativeProbability(wt.meta.cellnorm,"M")
tg.cumsum <- generateCumulativeProbability(tg.meta.cellnorm,"M")
plot(wt.cumsum$cell_counts,wt.cumsum$cumsum,col = "red",lwd = 1,
     ylim = c(0,5e5))
points(tg.cumsum$cell_counts,tg.cumsum$cumsum,col = "blue",lwd = 1)

wt.cumsum <- generateCumulativeProbabilityNorm(wt.meta.cellnorm,"M")
tg.cumsum <- generateCumulativeProbabilityNorm(tg.meta.cellnorm,"M")
plot(1:nrow(wt.cumsum)/nrow(wt.cumsum),wt.cumsum$cumsum,col = "red",lwd = 1,xlab = "% of all clones", ylab = "cumulative cellular output")
points(1:nrow(tg.cumsum)/nrow(tg.cumsum),tg.cumsum$cumsum,col = "blue",lwd = 1)

m1.wt <- generateCumulativeProbabilityNorm(m1.wt.cellnorm,"M")
m2.wt <- generateCumulativeProbabilityNorm(m2.wt.cellnorm,"M")
m3.wt <- generateCumulativeProbabilityNorm(m3.wt.cellnorm,"M")
m4.wt <- generateCumulativeProbabilityNorm(m4.wt.cellnorm,"M")

m1.tg <- generateCumulativeProbabilityNorm(m1.tg.cellnorm,"M")
m2.tg <- generateCumulativeProbabilityNorm(m2.tg.cellnorm,"M")
m3.tg <- generateCumulativeProbabilityNorm(m3.tg.cellnorm,"M")
m4.tg <- generateCumulativeProbabilityNorm(m4.tg.cellnorm,"M")


plot(1:nrow(m1.wt)/nrow(m1.wt),m1.wt$cumsum,col = "red",lwd = 1,xlim = c(0,1),
     xlab = "% of all clones", ylab = "cumulative number of cells")
points(1:nrow(m2.wt)/nrow(m2.wt),m2.wt$cumsum,col = "orange",lwd = 1)
points(1:nrow(m3.wt)/nrow(m3.wt),m3.wt$cumsum,col = "dark red",lwd = 1)
points(1:nrow(m4.wt)/nrow(m4.wt),m4.wt$cumsum,col = "yellow",lwd = 1)
points(1:nrow(m1.tg)/nrow(m1.tg),m1.tg$cumsum,col = "blue",lwd = 1)
points(1:nrow(m2.tg)/nrow(m2.tg),m2.tg$cumsum,col = "dark blue",lwd = 1)
points(1:nrow(m3.tg)/nrow(m3.tg),m3.tg$cumsum,col = "green",lwd = 1)
points(1:nrow(m4.tg)/nrow(m4.tg),m4.tg$cumsum,col = "dark green",lwd = 1)
abline(h = 0.05, col = 'red')
text(0.2,0.05,"threshold for high output clones",srt=0.1,pos=3, col = "red",cex = 0.7)

wt.highoutputclones <- c(
    m1.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m2.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m3.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m4.wt %>% filter(cumsum > 0.05) %>% nrow())


tg.highoutputclones <- c(
    m1.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m2.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m3.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m4.tg %>% filter(cumsum > 0.05) %>% nrow())


boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in M: clonal diversity",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

#data is normally distributed
shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.92559, p-value = 0.4768
t.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Welch Two Sample t-test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## t = 0.28664, df = 5.441, p-value = 0.785
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -34.89138  43.89138
## sample estimates:
## mean of x mean of y 
##     57.25     52.75
wt.highoutputclones <- unlist(c(
    m1.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))


tg.highoutputclones <- unlist(c(
    m1.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))

#data is normally distributed

boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in M: clone sizes",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.85835, p-value = 0.1156
t.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Welch Two Sample t-test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## t = 0.17867, df = 5.5682, p-value = 0.8645
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1495.472  1726.307
## sample estimates:
## mean of x mean of y 
##  1329.318  1213.901

#lets run the same analysis in the other lineages to see if it’s really a B-cell specific phenomenon

wt.cumsum <- generateCumulativeProbability(wt.meta.cellnorm,"E")
tg.cumsum <- generateCumulativeProbability(tg.meta.cellnorm,"E")
plot(wt.cumsum$cell_counts,wt.cumsum$cumsum,col = "red",lwd = 1,
     ylim = c(0,5e5))
points(tg.cumsum$cell_counts,tg.cumsum$cumsum,col = "blue",lwd = 1)

wt.cumsum <- generateCumulativeProbabilityNorm(wt.meta.cellnorm,"E")
tg.cumsum <- generateCumulativeProbabilityNorm(tg.meta.cellnorm,"E")
plot(1:nrow(wt.cumsum)/nrow(wt.cumsum),wt.cumsum$cumsum,col = "red",lwd = 1,xlab = "% of all clones", ylab = "cumulative cellular output")
points(1:nrow(tg.cumsum)/nrow(tg.cumsum),tg.cumsum$cumsum,col = "blue",lwd = 1)

m1.wt <- generateCumulativeProbabilityNorm(m1.wt.cellnorm,"E")
m2.wt <- generateCumulativeProbabilityNorm(m2.wt.cellnorm,"E")
m3.wt <- generateCumulativeProbabilityNorm(m3.wt.cellnorm,"E")
m4.wt <- generateCumulativeProbabilityNorm(m4.wt.cellnorm,"E")

m1.tg <- generateCumulativeProbabilityNorm(m1.tg.cellnorm,"E")
m2.tg <- generateCumulativeProbabilityNorm(m2.tg.cellnorm,"E")
m3.tg <- generateCumulativeProbabilityNorm(m3.tg.cellnorm,"E")
m4.tg <- generateCumulativeProbabilityNorm(m4.tg.cellnorm,"E")


plot(1:nrow(m1.wt)/nrow(m1.wt),m1.wt$cumsum,col = "red",lwd = 1,xlim = c(0,1),
     xlab = "% of all clones", ylab = "cumulative number of cells")
points(1:nrow(m2.wt)/nrow(m2.wt),m2.wt$cumsum,col = "orange",lwd = 1)
points(1:nrow(m3.wt)/nrow(m3.wt),m3.wt$cumsum,col = "dark red",lwd = 1)
points(1:nrow(m4.wt)/nrow(m4.wt),m4.wt$cumsum,col = "yellow",lwd = 1)
points(1:nrow(m1.tg)/nrow(m1.tg),m1.tg$cumsum,col = "blue",lwd = 1)
points(1:nrow(m2.tg)/nrow(m2.tg),m2.tg$cumsum,col = "dark blue",lwd = 1)
points(1:nrow(m3.tg)/nrow(m3.tg),m3.tg$cumsum,col = "green",lwd = 1)
points(1:nrow(m4.tg)/nrow(m4.tg),m4.tg$cumsum,col = "dark green",lwd = 1)
abline(h = 0.05, col = 'red')
text(0.2,0.05,"threshold for high output clones",srt=0.1,pos=3, col = "red",cex = 0.7)

wt.highoutputclones <- c(
    m1.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m2.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m3.wt %>% filter(cumsum > 0.05) %>% nrow(),
    m4.wt %>% filter(cumsum > 0.05) %>% nrow())


tg.highoutputclones <- c(
    m1.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m2.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m3.tg %>% filter(cumsum > 0.05) %>% nrow(),
    m4.tg %>% filter(cumsum > 0.05) %>% nrow())


boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in E: clonal diversity",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

#data is normally distributed
shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.9456, p-value = 0.6669
t.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Welch Two Sample t-test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## t = -0.33491, df = 4.6363, p-value = 0.7523
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -46.51546  36.01546
## sample estimates:
## mean of x mean of y 
##     60.00     65.25
wt.highoutputclones <- unlist(c(
    m1.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.wt %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))


tg.highoutputclones <- unlist(c(
    m1.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m2.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m3.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts)),
    m4.tg %>% filter(cumsum > 0.05) %>% summarise(median = median(cell_counts))))

#data is normally distributed

boxplot(wt.highoutputclones,
        tg.highoutputclones ,outline = F,
        main = "High output clones in E: clone sizes",
        ylab = "total number of clones per mouse",names = c("WT","Tg"), col = c("red","blue"))

shapiro.test(c(wt.highoutputclones,
               tg.highoutputclones))
## 
##  Shapiro-Wilk normality test
## 
## data:  c(wt.highoutputclones, tg.highoutputclones)
## W = 0.77251, p-value = 0.01444
wilcox.test(wt.highoutputclones,tg.highoutputclones)
## 
##  Wilcoxon rank sum exact test
## 
## data:  wt.highoutputclones and tg.highoutputclones
## W = 8, p-value = 1
## alternative hypothesis: true location shift is not equal to 0